[Autogluon] df50 X범주_auto

Author

김보람

Published

October 11, 2023

imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # for train_test_split
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv


# autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
def down_sample_textbook(df):
    # balance the classes: keep every fraud row and downsample the
    # non-fraud majority to the same size
    df_majority = df[df.is_fraud == 0].copy()
    df_minority = df[df.is_fraud == 1].copy()
    df_maj_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled
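
A toy check of the behavior (hypothetical frame): every fraud row is kept and the non-fraud rows are sampled down to match.

toy = pd.DataFrame({"is_fraud": [0]*8 + [1]*2})
down_sample_textbook(toy).is_fraud.value_counts()  # 2 rows of each class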

def compute_time_difference(group):
    # all pairwise absolute time gaps (in nanoseconds) within a group,
    # returned as [index_i, index_j, |t_i - t_j|] triples
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
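
For illustration, a hypothetical two-row group: Timestamp.value is a nanosecond epoch count, so a 5-minute gap shows up as 3.0e11 nanoseconds.

g = pd.DataFrame({"trans_date_trans_time": pd.to_datetime(["2019-01-01 00:00", "2019-01-01 00:05"])})
compute_time_difference(g)
# [[0, 0, 0], [0, 1, 300000000000], [1, 0, 300000000000], [1, 1, 0]]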


class GCN(torch.nn.Module):
    # two-layer GCN: 1-dim node features -> 16 hidden units -> 2-class log-probabilities
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 16)
        self.conv2 = GCNConv(16,2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
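
The class is defined here for the series but not used in this post. A toy forward pass (a sketch; the 4-node graph and random 1-dim features are made up to match GCNConv(1, 16)):

from torch_geometric.data import Data

toy = Data(x=torch.randn(4, 1), edge_index=torch.tensor([[0, 1, 2, 3], [1, 0, 3, 2]]))
GCN()(toy).shape  # torch.Size([4, 2]): per-node log-probabilities over 2 classes
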
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns

Data preparation

_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02.shape
(214520, 22)
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
df50 = df50.reset_index()
N = len(df50)

autogluon1: amt

df50 = df50[["amt","is_fraud"]]
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)

A. Data

tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)

B. Create the predictor

predictr = TabularPredictor(label="is_fraud")
No path specified. Models will be saved in: "AutogluonModels/ag-20231011_124800/"

C. Fit

predictr.fit(tr) 
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231011_124800/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail:   746.82 GB / 982.82 GB (76.0%)
Train Data Rows:    9009
Train Data Columns: 1
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    14427.76 MB
    Train Data (Original)  Memory Usage: 0.07 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 1 | ['amt']
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 1 | ['amt']
    0.0s = Fit runtime
    1 features in original data used to generate 1 features in processed data.
    Train Data (Processed) Memory Usage: 0.07 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.05s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8108, Val Rows: 901
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.8779   = Validation score   (accuracy)
    0.01s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: KNeighborsDist ...
    0.8635   = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBMXT ...
    0.8768   = Validation score   (accuracy)
    0.15s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBM ...
    0.8923   = Validation score   (accuracy)
    0.23s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: RandomForestGini ...
    0.8513   = Validation score   (accuracy)
    0.31s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.8513   = Validation score   (accuracy)
    0.32s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: CatBoost ...
    0.8946   = Validation score   (accuracy)
    0.64s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: ExtraTreesGini ...
    0.8602   = Validation score   (accuracy)
    0.28s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.8579   = Validation score   (accuracy)
    0.28s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 1: early stopping
    0.8635   = Validation score   (accuracy)
    2.93s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: XGBoost ...
    0.8935   = Validation score   (accuracy)
    0.11s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: NeuralNetTorch ...
    0.8857   = Validation score   (accuracy)
    4.91s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBMLarge ...
    0.8946   = Validation score   (accuracy)
    0.35s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.9023   = Validation score   (accuracy)
    0.48s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 11.38s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231011_124800/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f1ff7efc670>
predictr.leaderboard()
                  model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2   0.902331       0.051067  7.458454                0.001309           0.513563            2       True         14
1         LightGBMLarge   0.894562       0.001894  0.414943                0.001894           0.414943            1       True         13
2              CatBoost   0.894562       0.001981  0.653966                0.001981           0.653966            1       True          7
3               XGBoost   0.893452       0.003316  0.114061                0.003316           0.114061            1       True         11
4              LightGBM   0.892342       0.003488  0.343734                0.003488           0.343734            1       True          4
5        NeuralNetTorch   0.885683       0.005610  5.186066                0.005610           5.186066            1       True         12
6        KNeighborsUnif   0.877913       0.006206  0.028794                0.006206           0.028794            1       True          1
7            LightGBMXT   0.876804       0.002243  0.245948                0.002243           0.245948            1       True          3
8        KNeighborsDist   0.863485       0.005649  0.024392                0.005649           0.024392            1       True          2
9       NeuralNetFastAI   0.863485       0.008246  2.861539                0.008246           2.861539            1       True         10
10       ExtraTreesGini   0.860155       0.029064  0.305516                0.029064           0.305516            1       True          8
11       ExtraTreesEntr   0.857936       0.029304  0.306215                0.029304           0.306215            1       True          9
12     RandomForestEntr   0.851276       0.028594  0.344763                0.028594           0.344763            1       True          6
13     RandomForestGini   0.851276       0.028716  0.321282                0.028716           0.321282            1       True          5
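
The leaderboard above ranks models by internal validation accuracy only. A minimal follow-up sketch (standard TabularPredictor API, using the tst split created in step A) scores the untouched test set:

predictr.evaluate(tst)     # metric dict for the best model on held-out data
predictr.leaderboard(tst)  # adds a score_test column next to score_val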

autogluon2: amt, distance

df50 = df50[["amt","distance_km", "is_fraud"]]
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)

A. Data

tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)

B. Create the predictor

predictr = TabularPredictor(label="is_fraud")
No path specified. Models will be saved in: "AutogluonModels/ag-20231011_125208/"

C. Fit

predictr.fit(tr) 
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231011_125208/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail:   746.64 GB / 982.82 GB (76.0%)
Train Data Rows:    9009
Train Data Columns: 2
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    14301.5 MB
    Train Data (Original)  Memory Usage: 0.14 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 2 | ['amt', 'distance_km']
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 2 | ['amt', 'distance_km']
    0.0s = Fit runtime
    2 features in original data used to generate 2 features in processed data.
    Train Data (Processed) Memory Usage: 0.14 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.04s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8108, Val Rows: 901
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.8646   = Validation score   (accuracy)
    0.01s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: KNeighborsDist ...
    0.8535   = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBMXT ...
    0.8879   = Validation score   (accuracy)
    0.33s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBM ...
    0.8912   = Validation score   (accuracy)
    0.22s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: RandomForestGini ...
    0.8701   = Validation score   (accuracy)
    0.32s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.8735   = Validation score   (accuracy)
    0.35s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: CatBoost ...
    0.899    = Validation score   (accuracy)
    0.51s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: ExtraTreesGini ...
    0.8613   = Validation score   (accuracy)
    0.28s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.8646   = Validation score   (accuracy)
    0.28s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 1: early stopping
    0.8624   = Validation score   (accuracy)
    2.94s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: XGBoost ...
    0.889    = Validation score   (accuracy)
    0.18s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: NeuralNetTorch ...
    0.8857   = Validation score   (accuracy)
    4.49s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.8779   = Validation score   (accuracy)
    0.38s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.899    = Validation score   (accuracy)
    0.5s     = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 11.21s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231011_125208/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f201b90b790>
predictr.leaderboard()
                  model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0              CatBoost   0.899001       0.001843  0.508956                0.001843           0.508956            1       True          7
1   WeightedEnsemble_L2   0.899001       0.003138  1.012600                0.001295           0.503644            2       True         14
2              LightGBM   0.891232       0.002272  0.223588                0.002272           0.223588            1       True          4
3               XGBoost   0.889012       0.003450  0.180574                0.003450           0.180574            1       True         11
4            LightGBMXT   0.887902       0.006675  0.331872                0.006675           0.331872            1       True          3
5        NeuralNetTorch   0.885683       0.005594  4.485880                0.005594           4.485880            1       True         12
6         LightGBMLarge   0.877913       0.001962  0.382782                0.001962           0.382782            1       True         13
7      RandomForestEntr   0.873474       0.030227  0.347987                0.030227           0.347987            1       True          6
8      RandomForestGini   0.870144       0.028919  0.315052                0.028919           0.315052            1       True          5
9        KNeighborsUnif   0.864595       0.006738  0.009886                0.006738           0.009886            1       True          1
10       ExtraTreesEntr   0.864595       0.029063  0.283869                0.029063           0.283869            1       True          9
11      NeuralNetFastAI   0.862375       0.011110  2.937038                0.011110           2.937038            1       True         10
12       ExtraTreesGini   0.861265       0.028793  0.284426                0.028793           0.284426            1       True          8
13       KNeighborsDist   0.853496       0.004887  0.004544                0.004887           0.004544            1       True          2
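
Accuracy on this balanced 50:50 sample is a reasonable headline number, but fraud detection usually cares about the positive class specifically. A quick sketch with the sklearn metrics already imported at the top (using predictr and tst from this section):

yhat = predictr.predict(tst)
y = tst["is_fraud"]
print(precision_score(y, yhat), recall_score(y, yhat), f1_score(y, yhat))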

autogluon3: amt, time, distance

df50['trans_date_trans_time'] = pd.to_datetime(df50['trans_date_trans_time'])
df50['trans_date_trans_time'] = (df50['trans_date_trans_time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
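
A one-line check (illustrative) that the floor-division yields Unix seconds:

(pd.Timestamp("2019-01-01") - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")  # 1546300800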

# integer-encode the merchant category (computed here, though not used in the final feature set below)
category_map = {category: index for index, category in enumerate(df50['category'].unique())}
df50['category'] = df50['category'].map(category_map)

def haversine(lat1, lon1, lat2, lon2):
    # Earth's mean radius (km)
    radius = 6371.0

    # convert degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    # haversine formula for great-circle distance
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = radius * c

    return distance
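
A quick sanity check with illustrative coordinates (roughly Seoul to Busan, whose great-circle distance is about 325 km):

haversine(37.5665, 126.9780, 35.1796, 129.0756)  # ≈ 325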

customer_lat = df50['lat']
customer_lon = df50['long']
store_lat = df50['merch_lat']
store_lon = df50['merch_long']
distances = haversine(customer_lat, customer_lon, store_lat, store_lon)
df50['distance_km'] = distances
df50 = df50[["amt",'trans_date_trans_time', 'distance_km', "is_fraud"]]
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)

A. Data

tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)

B. Create the predictor

predictr = TabularPredictor(label="is_fraud")
No path specified. Models will be saved in: "AutogluonModels/ag-20231011_124455/"

C. Fit

predictr.fit(tr) 
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231011_124455/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail:   747.06 GB / 982.82 GB (76.0%)
Train Data Rows:    9009
Train Data Columns: 3
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    14638.47 MB
    Train Data (Original)  Memory Usage: 0.22 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 2 | ['amt', 'distance_km']
        ('int', [])   : 1 | ['trans_date_trans_time']
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 2 | ['amt', 'distance_km']
        ('int', [])   : 1 | ['trans_date_trans_time']
    0.0s = Fit runtime
    3 features in original data used to generate 3 features in processed data.
    Train Data (Processed) Memory Usage: 0.22 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.04s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8108, Val Rows: 901
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    0.727    = Validation score   (accuracy)
    0.02s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: KNeighborsDist ...
    0.7236   = Validation score   (accuracy)
    0.01s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBMXT ...
    0.8812   = Validation score   (accuracy)
    0.27s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBM ...
    0.8912   = Validation score   (accuracy)
    0.19s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: RandomForestGini ...
    0.8757   = Validation score   (accuracy)
    0.33s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.8835   = Validation score   (accuracy)
    0.36s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: CatBoost ...
    0.8923   = Validation score   (accuracy)
    0.89s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: ExtraTreesGini ...
    0.8701   = Validation score   (accuracy)
    0.29s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.8724   = Validation score   (accuracy)
    0.3s     = Training   runtime
    0.03s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 4: early stopping
    0.8602   = Validation score   (accuracy)
    3.48s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: XGBoost ...
    0.8923   = Validation score   (accuracy)
    0.14s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: NeuralNetTorch ...
    0.8746   = Validation score   (accuracy)
    3.71s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.8768   = Validation score   (accuracy)
    0.35s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.9279   = Validation score   (accuracy)
    0.5s     = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 11.23s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231011_124455/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f205bd3bc70>
predictr.leaderboard()
                  model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2   0.927858       0.034292  9.131091                0.001279           0.502710            2       True         14
1              CatBoost   0.892342       0.002311  0.885833                0.002311           0.885833            1       True          7
2               XGBoost   0.892342       0.003765  0.143192                0.003765           0.143192            1       True         11
3              LightGBM   0.891232       0.002339  0.189846                0.002339           0.189846            1       True          4
4      RandomForestEntr   0.883463       0.029521  0.356271                0.029521           0.356271            1       True          6
5            LightGBMXT   0.881243       0.003287  0.271952                0.003287           0.271952            1       True          3
6         LightGBMLarge   0.876804       0.001867  0.351746                0.001867           0.351746            1       True         13
7      RandomForestGini   0.875694       0.030285  0.330684                0.030285           0.330684            1       True          5
8        NeuralNetTorch   0.874584       0.005663  3.705228                0.005663           3.705228            1       True         12
9        ExtraTreesEntr   0.872364       0.030204  0.300827                0.030204           0.300827            1       True          9
10       ExtraTreesGini   0.870144       0.029252  0.293159                0.029252           0.293159            1       True          8
11      NeuralNetFastAI   0.860155       0.008793  3.475210                0.008793           3.475210            1       True         10
12       KNeighborsUnif   0.726970       0.007053  0.015347                0.007053           0.015347            1       True          1
13       KNeighborsDist   0.723640       0.004987  0.005171                0.004987           0.005171            1       True          2
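
With three features in play it is worth asking which one carries the signal. A sketch using TabularPredictor's built-in permutation importance (the score is the drop in test accuracy when a column's values are shuffled):

predictr.feature_importance(tst)  # importance of amt, trans_date_trans_time, distance_km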